#!/usr/bin/env bash
# Launch a vLLM OpenAI-compatible server for Qwen3-0.6B with a LoRA adapter
# registered under the name "law". The served model name is "chat".
#
# Env overrides (defaults preserve the original hard-coded values):
#   MODEL_PATH    base model directory      (default /data/model/Qwen3-0.6B)
#   LORA_PATH     LoRA adapter directory    (default /home/nuaa/zjp/rlhf/output_qwen_lora/lora)
#   PORT          HTTP port to listen on    (default 10003)
#   GPU_MEM_UTIL  GPU memory fraction       (default 0.35)
set -euo pipefail

# Previous invocation without LoRA, kept for reference:
# vllm serve /data/model/Qwen3-0.6B --max-model-len 2048 --port 10001 --served-model-name chat

# NOTE(review): FlashInfer sampler explicitly disabled — presumably a
# workaround for a sampler issue with this model/vLLM version; confirm.
export VLLM_USE_FLASHINFER_SAMPLER=0

model_path="${MODEL_PATH:-/data/model/Qwen3-0.6B}"
lora_path="${LORA_PATH:-/home/nuaa/zjp/rlhf/output_qwen_lora/lora}"
port="${PORT:-10003}"
gpu_mem_util="${GPU_MEM_UTIL:-0.35}"

# exec: replace the shell with the server process so signals (SIGTERM etc.)
# reach vllm directly instead of the wrapper shell.
exec vllm serve "$model_path" \
  --max-model-len 2048 \
  --port "$port" \
  --served-model-name chat \
  --enable-lora \
  --lora-modules "law=$lora_path" \
  --gpu-memory-utilization "$gpu_mem_util"